import lxml.html
import requests
from lxml import etree


def spider_index(index_url="https://www.jxnu.edu.cn/51/list.htm", timeout=10):
    """Fetch the index page and pass every academy link found to ``get_next``.

    The page is downloaded, the ``<p>`` children of the element with id
    ``wp_content_w6_0`` are scanned, and for each paragraph that carries a
    link, ``get_next(school, school_url, academy_url, academy)`` is called.

    Args:
        index_url: URL of the index page to crawl. Defaults to the page the
            script was written for.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    school = "江西师范大学"
    school_url = "https://www.jxnu.edu.cn/"

    # The context manager releases pooled connections once the page is read.
    with requests.Session() as session:
        response = session.get(index_url, timeout=timeout)
        response.raise_for_status()
        # Decode the raw bytes as UTF-8 directly. Going through
        # ``response.text`` and re-encoding as ISO-8859-1 only works when
        # requests guessed Latin-1; if it detects any other charset the
        # re-encode raises UnicodeEncodeError on the Chinese characters.
        page = response.content.decode("utf-8")

    tree = etree.HTML(page)
    if tree is None:
        # Nothing parseable came back (e.g. an empty body); nothing to crawl.
        return

    for paragraph in tree.xpath("//*[@id='wp_content_w6_0']/p"):
        markup = lxml.html.tostring(paragraph).decode()
        # Presumably filters out empty/spacer paragraphs: anything whose
        # serialized markup is 20 characters or shorter is ignored.
        if len(markup) <= 20:
            continue

        fragment = etree.HTML(markup)

        hrefs = fragment.xpath("//a/@href")
        if not hrefs:
            # Paragraph without a link: there is no academy URL to follow.
            continue

        # The academy name is the segment after the first "." in the text
        # (the leading segment looks like an ordinal -- verify against the
        # live page). Entries without a "." cannot be parsed and are skipped.
        segments = str(fragment.xpath("string()")).split(".")
        if len(segments) < 2:
            continue

        academy = segments[1].replace(" ", "")
        get_next(school, school_url, hrefs[0], academy)


def get_next(school, school_url, academy_url, academy):
    """Print one crawled record on a single space-separated line.

    The fields are written in the order school, school_url, academy,
    academy_url -- note that the academy name comes before its URL, which
    differs from the parameter order.
    """
    record = (school, school_url, academy, academy_url)
    print(*record)


# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    spider_index()
